Random Forest Classification

Use scikit-learn's Random Forest implementation to perform a 10-fold cross-validation on imbalanced data.


In [1]:
def computeCV(data):
    # http://scikit-learn.org/dev/modules/classes.html#module-sklearn.cross_validation
    from sklearn import ensemble, model_selection  # cross_validation was renamed to model_selection in scikit-learn 0.18
    clf = ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=5, verbose=0)
    # http://randomforests.wordpress.com/2014/02/02/basics-of-k-fold-cross-validation-and-gridsearchcv-in-scikit-learn/
    res = model_selection.cross_val_score(clf, data.data, data.target, cv=10, n_jobs=5)
    print(res)
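    # a mean +/- std summary of the fold scores is often more readable, e.g.:
    #   print('%.3f +/- %.3f' % (res.mean(), res.std()))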

def dtime_to_seconds(dtime):
    return dtime.seconds + (dtime.microseconds * 1e-6)

def bench(func, data, n=10):
    # at least 3 runs are needed so the two extremal timings can be dropped
    assert n > 2
    score = np.inf
    try:
        time = []
        for i in range(n):
            score, t = func(*data)
            time.append(dtime_to_seconds(t))
        # remove extremal values
        time.pop(np.argmax(time))
        time.pop(np.argmin(time))
    except Exception as detail:
        print('%s error in function %s' % (repr(detail), func))
        time = []
    return score, np.array(time)

def bench_skl(X, y, T, valid):
    from sklearn import ensemble #, linear_model
    #from sklearn.utils import safe_asarray
    start = datetime.now()
    
    # balance the dataset
    # https://github.com/scikit-learn/scikit-learn/blob/8dab222cfe894126dfb67832da2f4e871b87bce7/sklearn/preprocessing/_weights.py
    # re-encode the labels as consecutive integers 0..n_classes-1
    y = np.searchsorted(np.unique(y), y)
    class_weight_bins = np.bincount(y)
    # from class counts to sample weights: rarer classes get larger weights
    sample_weights = 1. / class_weight_bins.take(y)
    sample_weights *= class_weight_bins.min()
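    # worked example with hypothetical labels: y = [0, 0, 0, 1]
    # -> bincount = [3, 1] -> weights = [1/3, 1/3, 1/3, 1] * 1,
    # i.e. each class contributes the same total weight to the fit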
    
    
    # http://scikit-learn.org/stable/modules/classes.html
    clf = ensemble.RandomForestClassifier(n_estimators=1000, n_jobs=5, verbose=0)
    #clf = linear_model.ElasticNet(alpha=0.5, l1_ratio=0.5)
    #clf = linear_model.LogisticRegression()
    #clf = neighbors.NeighborsClassifier(n_neighbors=n_neighbors, algorithm='brute_inplace')
    #clf = skl_cluster.KMeans(k=n_components, n_init=1)
    #...
    clf.fit(X, y, sample_weight=sample_weights)

    ## Regression
    # pred = clf.predict(T)
    # delta = datetime.now() - start
    # mse = np.linalg.norm(pred - valid, 2) ** 2
    # return mse, delta

    # Classification
    score = np.mean(clf.predict(T) == valid)
    return score, datetime.now() - start
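# Plain accuracy (the score above) can hide poor minority-class recall; with
# scikit-learn >= 0.20 a per-class average is available directly, e.g.:
#   from sklearn.metrics import balanced_accuracy_score
#   score = balanced_accuracy_score(valid, clf.predict(T))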

def computeAverageFromNRuns(data, num_tries, TH):
    # random split: samples whose draw is >= TH form the training set,
    # the remaining samples the held-out test set
    sample_range = np.random.random_sample(size=data.target.shape[0])
    train_mask = sample_range >= TH
    X = data.data[train_mask]
    Y = data.target[train_mask]
    T = data.data[~train_mask]
    valid = data.target[~train_mask]

    #X, T, Y, valid = model_selection.train_test_split(data.data, data.target, test_size=0.9, random_state=0)

    score, times = bench(bench_skl, (X, Y, T, valid), num_tries)
    print('Tries:', num_tries, 'Score:', score, 'Time:', np.mean(times), '(mean)', np.median(times), '(median)')

from sklearn import datasets
import numpy as np
from datetime import datetime
#from sklearn import model_selection

iris = datasets.load_iris()

computeCV(iris)

runs = 25
TH = 0.9

computeAverageFromNRuns(iris, runs, TH)


[ 1.          0.93333333  1.          0.93333333  0.93333333  0.93333333
  0.86666667  1.          1.          1.        ]
Tries: 25 Score: 0.919708029197 Time: 0.868175434783 (mean) 0.873602 (median)
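
For comparison, with a current scikit-learn (0.18 or later) the same balanced 10-fold cross-validation can be expressed more compactly: class_weight='balanced' computes the inverse-frequency sample weights internally, and StratifiedKFold keeps the class proportions identical in every fold. A minimal sketch, not part of the run above:

In [2]:
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

iris = datasets.load_iris()

# 'balanced' reweights samples inversely to class frequency, replacing the
# manual bincount-based sample weights computed in bench_skl above
clf = RandomForestClassifier(n_estimators=1000, n_jobs=5, class_weight='balanced')

# stratified folds preserve the class ratios, which matters on imbalanced data
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
scores = cross_val_score(clf, iris.data, iris.target, cv=cv, n_jobs=5)
print('%.3f +/- %.3f' % (scores.mean(), scores.std()))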